import scrapy
from meishi.items import MeishiItem


class MeishijSpider(scrapy.Spider):
    """Spider for meishij.net recipe categories.

    Crawl flow:
        parse      -> top-level category links on the start page
        parse_two  -> second-level category links
        parse_page -> paginated recipe listings (follows "next" links)
        parse_url  -> recipe detail page; yields one MeishiItem per recipe
    """

    name = 'meishij'
    allowed_domains = ['meishij.net']
    start_urls = ['https://www.meishij.net/china-food/caixi/other/']

    # Pool of browser User-Agent strings.
    # NOTE(review): this was originally a *set* literal named ``headers``;
    # it holds UA strings rather than HTTP headers, and is not referenced
    # anywhere in this spider.  The name is kept for backward compatibility,
    # but it is now a list so it preserves order and works with e.g.
    # random.choice() (which rejects sets).  Presumably intended for a
    # rotating-UA downloader middleware — TODO confirm.
    headers = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    ]

    @staticmethod
    def _pair(text, sep):
        """Split *text* on *sep* and return exactly two parts, padding with ''.

        Guards against listing entries whose text lacks the separator, which
        previously raised IndexError when indexing the split result at [1].
        """
        parts = (text or '').split(sep)
        while len(parts) < 2:
            parts.append('')
        return parts[0], parts[1]

    # Walk the first-level category directory.
    def parse(self, response):
        """Parse the start page and follow every top-level category link.

        The first anchor and the last two anchors of the nav list are not
        real categories (matching the original pop(0)/pop()/pop() calls)
        and are skipped via slicing, which is also safe on short lists.
        """
        for link in response.xpath("//ul[@class='listnav_ul']//a")[1:-2]:
            url = link.xpath("./@href").get()
            title = link.xpath("./text()").get()
            if not url or not title:
                # Malformed anchor: skip instead of raising IndexError.
                continue
            # urljoin resolves site-relative hrefs and leaves absolute ones
            # untouched (the original only prefixed hrefs starting with '/').
            yield scrapy.Request(
                response.urljoin(url),
                callback=self.parse_two,
                dont_filter=True,
                meta={'type1': title},
            )

    def parse_two(self, response):
        """Parse a first-level category page and follow second-level links."""
        type1 = response.meta['type1']

        # Second-level category anchors (two layouts observed on the site).
        anchors = response.xpath("//div[@class='other_c listnav_con clearfix']/dl//dd//a")
        anchors = anchors + response.xpath("//div[@class='other_c listnav_con clearfix']/dl//dd/h1//a")

        # Some pages (e.g. home-style dishes) use a different container whose
        # last five anchors are not categories; drop them.  The slice is safe
        # when fewer than five anchors exist (the original pop() loop raised).
        extra = response.xpath("//div[@class='listnav_con clearfix']/dl//dd//a")
        if extra:
            anchors = anchors + extra[:-5]

        # Walk the second-level directory.
        for anchor in anchors:
            name = anchor.xpath("./text()").get()
            url = anchor.xpath("./@href").get()
            if not url or not name:
                continue
            yield scrapy.Request(
                response.urljoin(url),  # second-level hrefs may be relative
                callback=self.parse_page,
                dont_filter=True,
                meta={'type1': type1, 'type2': name},
            )

    # Walk each listing page of a second-level category.
    def parse_page(self, response):
        """Request every recipe detail page on this listing, then follow pagination."""
        type1 = response.meta['type1']
        type2 = response.meta['type2']

        # One <a class="big"> per recipe on the listing page.
        for food in response.xpath("//a[@class='big']"):
            food_url = food.xpath("./@href").get()
            if not food_url:
                continue
            food_name = food.xpath("./@title").get(default='')
            image = food.xpath("./img[@class='img']/@src").get(default='')

            # "<step count>/<cooking time>" separated by "/"
            step_count, make_time = self._pair(
                food.xpath(".//div[@class='c2']/ul/li[@class='li1']/text()").get(default=''), "/")
            # "<technology>/<flavor>" separated by "/"
            technology, flavor = self._pair(
                food.xpath(".//div[@class='c2']/ul/li[@class='li2']/text()").get(default=''), "/")
            # "<comment count>  <popularity>" separated by two spaces
            comment, popularity = self._pair(
                food.xpath(".//div[@class='c1']/span/text()").get(default=''), "  ")

            yield scrapy.Request(
                response.urljoin(food_url),
                callback=self.parse_url,
                dont_filter=True,
                meta={
                    'type1': type1,
                    'type2': type2,
                    'name': food_name,
                    'image': image,
                    'makeStepCount': step_count,
                    'makeTime': make_time,
                    'technology': technology,
                    'flavor': flavor,
                    'comment': comment,
                    'popularity': popularity,
                },
            )

        # Follow the "next page" link, if any, keeping the category meta.
        next_url = response.xpath("//a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse_page,
                dont_filter=True,
                meta={'type1': type1, 'type2': type2},
            )

    # Scrape the recipe detail page.
    def parse_url(self, response):
        """Parse a recipe detail page and yield a fully populated MeishiItem."""
        foodItem = MeishiItem()
        # Category and listing data carried through the request meta.
        foodItem['TypeOne'] = response.meta['type1']
        foodItem['TypeTwo'] = response.meta['type2']
        foodItem['name'] = response.meta['name']
        foodItem['technology'] = response.meta['technology']
        foodItem['flavor'] = response.meta['flavor']
        foodItem['makeTime'] = response.meta['makeTime']
        foodItem['makeStepCount'] = response.meta['makeStepCount']
        foodItem['popularity'] = response.meta['popularity']
        foodItem['comment'] = response.meta['comment']
        foodItem['image'] = response.meta['image']

        # Health-benefit tags (list of strings; may be empty).
        foodItem['benefit'] = response.xpath("//dl[@class='yj_tags clearfix']/dt/a/text()").extract()
        # Short description; default to '' when the page lacks one
        # (the original extract()[0] raised IndexError in that case).
        foodItem['describe'] = response.xpath("//div[@class='materials']/p/text()").get(default='')
        # Main-ingredient names.
        foodItem['mainIngredients'] = response.xpath("//div[@class='c']/h4/a/text()").extract()
        # Auxiliary-ingredient (seasoning) names.
        foodItem['auxiliaryIngredients'] = response.xpath("//ul[@class='clearfix']/li/h4/a/text()").extract()
        yield foodItem